Note: Open scripts.Rproj first, then script. To easily use relative paths, click the down button next to knit and then click “Knit Directory –> Project Directory”. This should make loading and saving files much easier.
Perform Differential Gene Expression (DGE) analysis using a read count table (usually from Salmon) and a sample metadata sheet (user made).
Red text indicates regions that require the user to modify. Regardless, the user should check over all code blocks to ensure that everything is running correctly.
Load packages.
library(tidyverse)
library(maditr)
#install.packages("remotes")
#BiocManager::install(c("GO.db", "AnnotationDbi", "impute", "preprocessCore"))
#install.packages("WGCNA")
#remotes::install_github("andymckenzie/DGCA")
library(DGCA)
library(dynamicTreeCut)
library(igraph)
library(ape)
library(gplots)
library(viridis)
Change file names and condition variables where appropriate.
# Input metadata
metadata.file <- "mag_metadata.tsv"
# Input coverage file
cov.file <- "coverage.tsv.gz"
# Output files
prefix <- "coverage"
output.edges <- paste(prefix, ".edges.tsv", sep = "")
output.nodeinfo <- paste(prefix, ".nodeInfo.tsv", sep = "")
Load coverm genome MAG coverage results - extract
coverage values from coverm genome results file and format
into a matrix for plotting.
Required for network analysis: data Matrix: Row:MAG_ID
<-> Column:Sample_ID
cov <- read.table(cov.file, sep='\t',
header=TRUE, check.names=FALSE, stringsAsFactors=FALSE)
# Build matrix using: Group_ID<->sample
data <- cov %>%
dcast(Genome ~ Sample, value.var = "TPM") %>%
filter(!Genome %in% c("unmapped")) %>%
column_to_rownames("Genome")
head(data)
## CreekBiofilm_1 CreekBiofilm_2 CreekBiofilm_3
## Cmer_10D 59164.360166 51895.223558 52982.83324
## Cmer_10D_chloroplast 570703.804493 551719.427109 544643.87894
## Cmer_10D_mitochondria 206003.114581 265873.104334 250798.79165
## Gsulp_YNP5587_1_chloroplast 62.940380 61.148588 156.84956
## Gsulp_YNP5587_1_mitochondria 32.180662 34.751753 69.98135
## Gsulp_YNP5587_1_nuclear 5.815226 4.457809 12.73132
## CreekBiofilm_4 Endolithic_1 Endolithic_2
## Cmer_10D 51989.007741 19046.8077 43632.7904
## Cmer_10D_chloroplast 582509.100437 160358.5184 441459.9538
## Cmer_10D_mitochondria 275171.041489 72455.2646 176981.0170
## Gsulp_YNP5587_1_chloroplast 57.196154 49433.5778 11611.5529
## Gsulp_YNP5587_1_mitochondria 42.830015 17173.2623 5048.9684
## Gsulp_YNP5587_1_nuclear 4.144556 998.2967 247.4212
## Endolithic_3 Endolithic_4 Soil_1 Soil_2
## Cmer_10D 45807.246 41867.230 24684.882 23025.133
## Cmer_10D_chloroplast 289676.845 229510.501 256330.401 151098.773
## Cmer_10D_mitochondria 133427.262 76968.379 63105.606 39496.232
## Gsulp_YNP5587_1_chloroplast 35992.987 54231.372 87719.135 123828.985
## Gsulp_YNP5587_1_mitochondria 14692.334 43071.855 59505.202 99061.646
## Gsulp_YNP5587_1_nuclear 1072.992 2916.658 1763.304 2788.818
## Soil_3 Soil_4
## Cmer_10D 25346.711 35772.55
## Cmer_10D_chloroplast 190385.014 223184.59
## Cmer_10D_mitochondria 81612.796 67010.84
## Gsulp_YNP5587_1_chloroplast 80036.722 74719.21
## Gsulp_YNP5587_1_mitochondria 37874.829 50874.65
## Gsulp_YNP5587_1_nuclear 2196.999 2399.31
Load metdata file. Also add colors to metadata for plotting.
Required for network analysis: metadata Matrix: Row:MAG
ID <-> Column:metadata for MAG - Requires MAG_ID,
Label and Label_color columns - Requires
MAG_ID to be row names - Requires MAG_ID to be
first column
metadata <- read.table(metadata.file, sep='\t', header=TRUE, check.names=FALSE, stringsAsFactors=FALSE, comment.char='')
head(metadata)
## MAG_ID Domain Realm Kingdom
## 1 Cmer_10D d__Eukaryota r__Eukaryota k__Eukaryota
## 2 Cmer_10D_chloroplast d__Eukaryota r__Eukaryota k__Eukaryota
## 3 Cmer_10D_mitochondria d__Eukaryota r__Eukaryota k__Eukaryota
## 4 Gsulp_YNP5587_1_nuclear d__Eukaryota r__Eukaryota k__Eukaryota
## 5 Gsulp_YNP5587_1_chloroplast d__Eukaryota r__Eukaryota k__Eukaryota
## 6 Gsulp_YNP5587_1_mitochondria d__Eukaryota r__Eukaryota k__Eukaryota
## Phylum Class Order Family
## 1 p__Rhodophyta c__Bangiophyceae o__Cyanidiales f__Cyanidiaceae
## 2 p__Rhodophyta c__Bangiophyceae o__Cyanidiales f__Cyanidiaceae
## 3 p__Rhodophyta c__Bangiophyceae o__Cyanidiales f__Cyanidiaceae
## 4 p__Rhodophyta c__Bangiophyceae o__Galdieriales f__Galdieriaceae
## 5 p__Rhodophyta c__Bangiophyceae o__Galdieriales f__Galdieriaceae
## 6 p__Rhodophyta c__Bangiophyceae o__Galdieriales f__Galdieriaceae
## Genus Species Label Label_color
## 1 g__Cyanidioschyzon s__merolae Cmer_10D #e31a1c
## 2 g__Cyanidioschyzon s__merolae Cmer_10D_chloroplast #ed6b6d
## 3 g__Cyanidioschyzon s__merolae Cmer_10D_mitochondria #f7bcbd
## 4 g__Galdieria s__sulphurarius Gsulp_YNP5587_1_nuclear #ff7f00
## 5 g__Galdieria s__sulphurarius Gsulp_YNP5587_1_chloroplast #ffa955
## 6 g__Galdieria s__sulphurarius Gsulp_YNP5587_1_mitochondria #ffd3a9
Also calculate significance (p- and adjusted p-) values for each edge (correlation between two pairs).
# 1. Transpose CoverM results
# - Convert from Rows:Genomes/MAGs & Columns:Samples -> Rows:Samples & Columns:Genomes/MAGs
# 2. Z-Scale
# - Scale values for each Genome/MAG across Samples (relative change in the abundance of these Genomes/MAGs across samples)
t.data <- data %>% t() %>% scale()
# Generate correlation matrix using scalled values
cor.data <- matCorr(t.data, corrType="pearson")
# Conver from a N x M matrix to a pairwise list with NxM rows
pairs.cor.data <- data.frame(n1=rownames(cor.data)[row(cor.data)],
n2=colnames(cor.data)[col(cor.data)],
cor=c(cor.data))
# Extract row and column names. Used to add these labels to matrix objects created later.
cor.data.row <- rownames(cor.data)
cor.data.col <- colnames(cor.data)
# Number of Samples per correlation value (i.e., number of Samples per Genome that were used for the correlation analysis - will be the same for all Genomes)
nsample.data <- matNSamp(t.data)
# Generate significance values for each correlation value.
cor.data.pval <- matCorSig(cor.data, nsample.data)
colnames(cor.data.pval) <- cor.data.col
rownames(cor.data.pval) <- cor.data.row
# Conver from a N x M matrix to a pairwise list with NxM rows
pairsPval <- data.frame(n1=rownames(cor.data.pval)[row(cor.data.pval)],
n2=colnames(cor.data.pval)[col(cor.data.pval)],
pval=c(cor.data.pval))
# Generate adjusted significance values
cor.data.pval.vec <- as.vector(cor.data.pval)
cor.data.adjPval <- adjustPVals(cor.data.pval.vec, adjust="BH")
# Reformat adjusted significance values and reformat vector into a matrix
cor.data.adjPval <- as.numeric(format.pval(cor.data.adjPval, digits=2, nsmall=3)) # Will produce 'Warning: NAs introduced by coercion'
## Warning: NAs introduced by coercion
dim(cor.data.adjPval) <- dim(cor.data.pval)
colnames(cor.data.adjPval) <- cor.data.col
rownames(cor.data.adjPval) <- cor.data.row
# Conver from a N x M matrix to a pairwise list with NxM rows
pairsAdjPval <- data.frame(n1=rownames(cor.data.adjPval)[row(cor.data.adjPval)],
n2=colnames(cor.data.adjPval)[col(cor.data.adjPval)],
adjPval=c(cor.data.adjPval))
# Join correlation values, p-values, and adjusted p-values together into a single table.
cor.data.val <- cbind(pairs.cor.data, pval=pairsPval$pval, adjPval=pairsAdjPval$adjPval)
# Remove rows with missing values.
cor.data.val.final <- cor.data.val[complete.cases(cor.data.val),]
# Filter rows by adjusted p-value cutoff.
maxPval <- 0.05
cor.data.val.final.filtered <- cor.data.val.final[cor.data.val.final$adjPval <= maxPval,]
# Write correlation and significance values for each pair of Genomes.
write.table(cor.data.val.final.filtered, file=output.edges, quote=FALSE, sep="\t", row.names=FALSE)
Identify modules in network using the cutreeDynamicTree
function.
# Identify modules using hclust and cutreeDynamicTree
hclust_tree<-hclust(as.dist(1-cor.data), method="complete")
module_labels <- cutreeDynamicTree(dendro=hclust_tree, minModuleSize=20, deepSplit=TRUE)
# Create a data.frame with the module number for each Genome
data.modules <- as.data.frame(module_labels)
data.modules$MAG_ID <- rownames(cor.data)
# Add column with a difference color for each module
Colors <- rainbow(max(module_labels)+1)
data.modules$module_labels_colors <- Colors[data.modules$module_labels+1]
Create igraph object for network stats calculations.
# Convert data.frame of network edges into igraph object (assumes first column is IDs to match)
network <- graph_from_data_frame(cor.data.val.final.filtered,
vertices = metadata,
directed = FALSE)
Identify modules using a different betweenness-based approach.
# Identify modules
dendrogram <- cluster_edge_betweenness(network)
plot_dendrogram(dendrogram)
# Extract module info for each MAG
t <- membership(dendrogram)
cluster_edge_betweenness <- data.frame(MAG_ID=names(t))
cluster_edge_betweenness$module_labels_2 <- t
# Assign a color to each module
mod.list <- unique(cluster_edge_betweenness$module_labels_2)
Cols=rep(rev(brewer.pal(12,"Paired")), ceiling(length(mod.list)/12))
Cols=colorRampPalette(Cols)(length(mod.list))
names(Cols) <- mod.list
print(paste(length(mod.list), "modules identified."))
# Add colors to module list
cluster_edge_betweenness$module_labels_colors_2 <- Cols[cluster_edge_betweenness$module_labels_2]
See: https://www.datacamp.com/tutorial/centrality-network-analysis-R
# Empty data.frame for results.
network.centrality <- data.frame(MAG_ID=names(V(network)))
# Calculate the degree of centrality for each node (take just the results)
network.centrality$degree <- centr_degree(network, mode = "all")$res
# Calculate the closeness for each node
network.centrality$closeness <- closeness(network, mode = "all")
See: https://ona-book.org/similarity.html Description of assortativity values: https://ona-book.org/similarity.html#assortativity-in-networks “The assortativity coefficient of a graph is a measure of the extent to which vertices with the same properties connect to each other” -1 = dissortive 0 = random +1 = assortive
assortativity_nominal(network,
as.integer(as.factor(V(network)$Label)),
directed=FALSE)
## [1] 0.03362965
assortativity_degree(network, directed = FALSE)
## [1] 0.3699796
Combine all information about a node (Genome/MAG/etc.) together into a single metadata data.frame.
# Get sum of values across all samples for each Genome.
data.rowsum <- data.frame("Row_Sum" = rowSums(data))
data.rowsum$MAG_ID <- rownames(data.rowsum)
# Row sum to log2 and log10.
data.rowsum$Row_Sum_log2 <- log2(data.rowsum$Row_Sum)
data.rowsum$Row_Sum_log10 <- log10(data.rowsum$Row_Sum)
# Join metadata files together.
metadata.combined <- metadata
# Join species taxonomy and completness info with row sums.
metadata.combined <- merge(metadata.combined, data.rowsum, by.x="MAG_ID", by.y="MAG_ID")
# Join existing metadata.combined with modules identified
metadata.combined <- merge(metadata.combined, data.modules, by.x="MAG_ID", by.y="MAG_ID")
#metadata.combined <- merge(metadata.combined, cluster_edge_betweenness, by.x="MAG_ID", by.y="MAG_ID")
# Join existing metadata with centrality measures results
metadata.combined <- merge(metadata.combined, network.centrality, by.x="MAG_ID", by.y="MAG_ID")
# Write metadata to file.
write.table(metadata.combined, file=output.nodeinfo, quote=FALSE, sep="\t", row.names=FALSE)
Plot heatmap of TMP values (Z-scaled) using a dendrogram derived from the correlation values used for network analysis.
par(cex.main=0.7, cex.lab=0.9, cex.axis=0.9) # Change the font size of the legend title,labels and axis
# Get dendrogram of groups (from clustering of correlation values)
hclust_tree.dend <- as.dendrogram(hclust_tree)
plot(as.phylo(hclust_tree), cex = 0.05)
# Reformat scladded data used for correlation analysis
t.data.tmp <- t.data %>% t() %>% as.matrix()
#
# Color by metadata groups
#
Rcol <- metadata.combined[match(rownames(t.data.tmp), metadata.combined$MAG_ID), ]$Label_color
heatmap.2(t.data.tmp,
Rowv=hclust_tree.dend,
main="TPM z-score scaled rows - network dendrogram",
Colv=FALSE,
dendrogram="row",
col=viridis,
trace="none", # Draw "trace" line
key=TRUE, # Show color-key
margins=c(8,8), # Numeric vector of length 2 containing the margins for column and row names, respectively.
offsetRow=0,
offsetCol=0,
cexRow=0.1, # Positive numbers, used as cex.axis in for the row or column axis labeling.
cexCol=0.4,
#cellnote=formatC(t.data.tmp, big.mark=",", format = "fg"),
notecex=0.2,
RowSideColors=Rcol,
)
heatmap.2(t.data.tmp,
#Rowv=hclust_tree.dend,
main="TPM z-score scaled rows - scaled value dendrogram",
Colv=FALSE,
dendrogram="row",
col=viridis,
trace="none", # Draw "trace" line
key=TRUE, # Show color-key
margins=c(8,8), # Numeric vector of length 2 containing the margins for column and row names, respectively.
offsetRow=0,
offsetCol=0,
cexRow=0.1, # Positive numbers, used as cex.axis in for the row or column axis labeling.
cexCol=0.4,
#cellnote=formatC(t.data.tmp, big.mark=",", format = "fg"),
notecex=0.2,
RowSideColors=Rcol,
)
#
# Color by dendrogram modules
#
Rcol <- metadata.combined[match(rownames(t.data.tmp), metadata.combined$MAG_ID), ]$module_labels_colors
heatmap.2(t.data.tmp,
Rowv=hclust_tree.dend,
main="TPM z-score scaled rows - network dendrogram",
Colv=FALSE,
dendrogram="row",
col=viridis,
trace="none", # Draw "trace" line
key=TRUE, # Show color-key
margins=c(8,8), # Numeric vector of length 2 containing the margins for column and row names, respectively.
offsetRow=0,
offsetCol=0,
cexRow=0.1, # Positive numbers, used as cex.axis in for the row or column axis labeling.
cexCol=0.4,
#cellnote=formatC(t.data.tmp, big.mark=",", format = "fg"),
notecex=0.2,
RowSideColors=Rcol,
)
heatmap.2(t.data.tmp,
#Rowv=hclust_tree.dend,
main="TPM z-score scaled rows - scaled value dendrogram",
Colv=FALSE,
dendrogram="row",
col=viridis,
trace="none", # Draw "trace" line
key=TRUE, # Show color-key
margins=c(8,8), # Numeric vector of length 2 containing the margins for column and row names, respectively.
offsetRow=0,
offsetCol=0,
cexRow=0.1, # Positive numbers, used as cex.axis in for the row or column axis labeling.
cexCol=0.4,
#cellnote=formatC(t.data.tmp, big.mark=",", format = "fg"),
notecex=0.2,
RowSideColors=Rcol,
)
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.2.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices datasets utils methods base
##
## other attached packages:
## [1] viridis_0.6.5 viridisLite_0.4.2 gplots_3.2.0
## [4] ape_5.8 igraph_2.1.1 dynamicTreeCut_1.63-1
## [7] DGCA_1.0.3 maditr_0.8.4 lubridate_1.9.3
## [10] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
## [13] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## [16] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 bitops_1.0-9 gridExtra_2.3
## [4] rlang_1.1.4 magrittr_2.0.3 matrixStats_1.4.1
## [7] compiler_4.3.1 RSQLite_2.3.7 png_0.1-8
## [10] vctrs_0.6.5 pkgconfig_2.0.3 crayon_1.5.3
## [13] fastmap_1.2.0 backports_1.5.0 XVector_0.42.0
## [16] caTools_1.18.3 utf8_1.2.4 rmarkdown_2.29
## [19] tzdb_0.4.0 preprocessCore_1.64.0 bit_4.5.0
## [22] xfun_0.49 zlibbioc_1.48.2 cachem_1.1.0
## [25] GenomeInfoDb_1.38.8 jsonlite_1.8.9 blob_1.2.4
## [28] parallel_4.3.1 cluster_2.1.6 R6_2.5.1
## [31] bslib_0.8.0 stringi_1.8.4 rpart_4.1.23
## [34] jquerylib_0.1.4 Rcpp_1.0.13-1 iterators_1.0.14
## [37] knitr_1.49 WGCNA_1.73 base64enc_0.1-3
## [40] IRanges_2.36.0 Matrix_1.6-5 splines_4.3.1
## [43] nnet_7.3-19 timechange_0.3.0 tidyselect_1.2.1
## [46] rstudioapi_0.17.1 yaml_2.3.10 doParallel_1.0.17
## [49] codetools_0.2-20 lattice_0.22-6 Biobase_2.62.0
## [52] withr_3.0.2 KEGGREST_1.42.0 evaluate_1.0.1
## [55] foreign_0.8-87 survival_3.7-0 Biostrings_2.70.3
## [58] pillar_1.9.0 KernSmooth_2.23-24 checkmate_2.3.2
## [61] renv_1.0.11 foreach_1.5.2 stats4_4.3.1
## [64] generics_0.1.3 RCurl_1.98-1.16 S4Vectors_0.40.2
## [67] hms_1.1.3 munsell_0.5.1 scales_1.3.0
## [70] gtools_3.9.5 glue_1.8.0 Hmisc_5.2-0
## [73] tools_4.3.1 data.table_1.16.2 fastcluster_1.2.6
## [76] grid_4.3.1 impute_1.76.0 AnnotationDbi_1.64.1
## [79] colorspace_2.1-1 nlme_3.1-166 GenomeInfoDbData_1.2.11
## [82] htmlTable_2.4.3 Formula_1.2-5 cli_3.6.3
## [85] fansi_1.0.6 gtable_0.3.6 sass_0.4.9
## [88] digest_0.6.37 BiocGenerics_0.48.1 htmlwidgets_1.6.4
## [91] memoise_2.0.1 htmltools_0.5.8.1 lifecycle_1.0.4
## [94] httr_1.4.7 GO.db_3.18.0 bit64_4.5.2